Load the necessary R libraries

#devtools::install_github("GuangchuangYu/ggtree")
library(treeio)
library(ggtree)
library(phytools)
library(viridisLite)
library(ggplot2)
library(googlesheets4)
library(dplyr)

Load data from google spreadsheet (Already stored in an R object that you can just load)

#id<-"1e7Hj_BQ8rke5XW7j93YL6QVAGbcvsb2yw7oD86WnPJY"
#gsheet<-read_sheet(id,sheet="GlobalOverview",skip = 2,na = c(""," ","NA","#DIV/0!"),col_types = "c",trim_ws=T)
#GlobalOverview<-as.data.frame(gsheet,stringsAsFactors=F)
#save(GlobalOverview,file="../../misc/data/gsheet.GlobalOverview.21-09-22.Robj")
load("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/1_code/r_scripts/gsheet.GlobalOverview.21-09-22.Robj")

Create a dataframe where the first row is the GAGA id (e.g. GAGA-0002), which are also used as the tip labels in the phylogeny

data2add<-data.frame(GAGA.id = GlobalOverview$`GAGA ID`, GlobalOverview)

Create a column with clean species IDs for all species.

This has to be done, because the database of all the sequenced species is a bit convoluted.

data2add$cleanSpeciesID<-dplyr::coalesce(data2add$Species,data2add$Species..from.Assembly.Overview.)

Load the tree file of all the GAGA genomes.

tree <- read.iqtree("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/1_code/r_scripts/hymenoptera_psc98_partition_b1000_40threads_msubnucl.treefile")

Create a very quick and dirty plot of the phylogeny

plot(tree@phylo,show.node.label = T,show.tip.label = T,cex=.5)
nodelabels(bg=rgb(0,0,0,0),col="red",frame = "none")

check the plot to see where the root should be. Here we root the tree at node 166 that separates the outgroups, the honeybee “Amel” (Apis mellifera) and the parasitoid wasp Nasonia (Nvit), from ants.

#tree.rooted<-treeio::root(tree,node=166)
tree.rooted <- tree
tree.rooted.subset<-treeio::drop.tip(tree.rooted,tip=c("NCBI-0018_Amel","NCBI-0019_Nvit"))

Plot basic tree

tp<-ggtree(tree.rooted.subset,ladderize = T, layout = "circular")+geom_rootedge(.01)

Add the species identity from FAS.genes.and.names.tsv

tp$data$label[tp$data$isTip==T]<-gsub("_.*","",tp$data$label[tp$data$isTip==T])

tp2<-tp %<+% data2add
tp2

1 A) Make complex tree with the subfamilies highlighted by color

#install.packages("ggnewscale")
library("ggnewscale")

#Make Subfamily as a factor (discrete color scale for factors)
tp3$data$Subfamily<-as.factor(tp3$data$Subfamily)

tp3<-tp2 + 
  geom_tiplab2(aes(label=paste(label,cleanSpeciesID,sep=" "), color = Subfamily),size=2,align = T,linesize = .1)+    #add the tip labels
  geom_nodepoint(aes(fill=UFboot),stroke=0.5,pch=21,size=2)+       #add the info about bootstrapping success as points of different colors to the nodes
 scale_fill_viridis_c()+  #change the color of the UFboot node points to a viridis color scale
  theme(legend.position = "left",
        legend.title=element_text(size=9), # the title size of legend.
        legend.text=element_text(size=7)
        )
tp3 #plot the tree

ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/Subfamilies.pdf",
width = 15,height=30) #save the tree in a pdf

Gather subfamily info for NCBI IDs as well

tp3$data[is.na(tp3$data$Subfamily) & tp3$data$isTip==T,]

2 A.1) LGT data: Include the LGT data in the circular phylogeny

#Add the raw LGT dataframe 
library(readxl)
LGTs.raw<-read_excel("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/GAGA.LGT.162.resulting.good.candidates.xlsx")
## New names:
## * `` -> ...27
## * `` -> ...28
## * `` -> ...29
LGTs.raw$...27 <- NULL
LGTs.raw$...28 <- NULL
LGTs.raw$...29 <- NULL
LGTs.raw$comment <- NULL

#Clean up the LGT dataframe to remove the database origin (GAGA-0001.euk -> GAGA-0001)
LGTs<-LGTs.raw
LGTs$GAGAid<-gsub("\\..*","",LGTs$.id)

3 A.2) LGT data: Add the LGT candidate numbers per genome to the phylogeny

Plot the circular phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs.

#Create a tally dataframe that counts how many LGT candidates we have per species. 
LGT.tally<-as.data.frame(table(select(LGTs,GAGAid)))
colnames(LGT.tally)<-c("GAGAid","LGT.count")

#Add the LGT.tally dataframe to the phylogeny object as additional data.
tp4<-tp3 %<+% LGT.tally
#see tp4$data$LGT.count

#Plot the phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs
tp5<-tp4+geom_tippoint(aes(size=LGT.count),col="red", alpha=.5)
tp5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.LGTs.pdf",width = 12,height=24) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

4 A.3) LGT data: Add the different LGT categories to the phylogeny as a heatmap

Create a dataframe that counts the genomes with ankyrin repeats

#Only select the ankyrins to show them as a heatmap
LGT.ankyrins<-subset(LGTs,predicted_protein=="ankyrin repeat protein") %>% group_by(GAGAid) %>% tally()
colnames(LGT.ankyrins)<-c("GAGAid","protein")

LGT.ankyrins$type<-"ankyrin repeat protein"
knitr::kable(head(LGT.ankyrins))
GAGAid protein type
GAGA-0020 9 ankyrin repeat protein
GAGA-0024 7 ankyrin repeat protein
GAGA-0028 27 ankyrin repeat protein
GAGA-0063 2 ankyrin repeat protein
GAGA-0087 31 ankyrin repeat protein
GAGA-0098 2 ankyrin repeat protein
#Only select the ankyrins and lysozymes to show them on the phylogeny
LGT.anks.lys<-subset(LGTs,predicted_protein %in% c("ankyrin repeat protein", "Lysozyme")) %>% group_by(GAGAid) 

LGT.anks.lys<-LGT.anks.lys[,c(24,26)]
LGT.anks.lys <- LGT.anks.lys %>% count(GAGAid,predicted_protein, sort = T)
colnames(LGT.anks.lys)<-c("GAGAid","protein","count")

knitr::kable(head(LGT.anks.lys))
GAGAid protein count
GAGA-0087 ankyrin repeat protein 31
GAGA-0028 ankyrin repeat protein 27
GAGA-0401 ankyrin repeat protein 16
GAGA-0020 ankyrin repeat protein 9
GAGA-0364 ankyrin repeat protein 8
GAGA-0024 ankyrin repeat protein 7

Plot the phylogeny and add ankyrin repeat proteins and lysozymes as a heatmap layer

library(ggtree)
library(ggtreeExtra)
library(ggplot2)
library(ggnewscale)
library(dplyr)
library(tidytree)
library(ggstar)
# For the bar layer outside of the tree
tp5.5 <- tp5 +
  new_scale_fill() +
  geom_fruit(
          data=LGT.anks.lys,
          geom=geom_bar,
          mapping=aes(y=GAGAid, x=count,fill=protein), #The count will be mapped to x
          pwidth=1,      # Width of the external layer
          stat="identity",
          orientation="y", #The orientation of the axis
          position=position_stackx(hexpand=.7),   #Adjust the position of the geom_layer
          axis.params =list(
                          axis="x",       # add axis text of the layer.
                          vjust=0.2,
                          text.angle=0,   # the text size of axis.
                          text.size=1.5,
                          hjust=1,        # adjust the horizontal position of text of axis
                          nbreak=2,       # number of lines within geom_layer, 
                          ),
          grid.params=list(alpha=.3)      # Parameter to adjust gridlines around the tree
      ) +
      scale_fill_manual(
        values=c("#339933","#dfac03"),   #color of the external geom_layer
        name="Protein",        #name of the legend for the external geom_layer
        guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6)
      ) +
      theme(legend.position=c(0.96, 0.5), # the position of legend.
          legend.background=element_rect(fill=NA), # the background of legend.
          legend.title=element_text(size=9), # the title size of legend.
          legend.text=element_text(size=7), # the text size of legend.
          legend.spacing.y = unit(0.02, "cm") 
      )
tp5.5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.anks.lys.pdf",width=13,height=26) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

5 A.4) Plot all predicted proteins onto the phylogeny

LGT.proteins<-LGTs[,c(24,26)]
LGT.proteins <- LGT.proteins %>% count(GAGAid,predicted_protein, sort = T)
colnames(LGT.proteins)<-c("GAGAid","protein","count")

LGT.proteins <- na.omit(LGT.proteins)
knitr::kable(head(LGT.proteins)) 
GAGAid protein count
GAGA-0087 ankyrin repeat protein 31
GAGA-0028 ankyrin repeat protein 27
GAGA-0401 ankyrin repeat protein 16
GAGA-0020 ankyrin repeat protein 9
GAGA-0364 ankyrin repeat protein 8
GAGA-0024 ankyrin repeat protein 7
library("viridis")
# For the bar layer outside of the tree
tp5.5 <- tp5 +
  new_scale_fill() +
  geom_fruit(
          data=LGT.proteins,
          geom=geom_bar,
          mapping=aes(y=GAGAid, x=count,fill=protein), #The count will be mapped to x
          pwidth=1,      # Width of the external layer
          stat="identity",
          orientation="y", #The orientation of the axis
          position=position_stackx(hexpand=.7)   #Adjust the position of the geom_layer
         # axis.params =list(
                         # axis="x",       # add axis text of the layer.
                         # vjust=0.2,
                         # text.angle=0,   # the text size of axis.
                         # text.size=1.5,
                         # hjust=1,        # adjust the horizontal position of text of axis
                         # nbreak=2,       # number of lines within geom_layer, 
                         # ),
        #  grid.params=list(alpha=.5)      # Parameter to adjust gridlines around the tree
      ) +
      scale_fill_discrete(     #color of the external geom_layer
       name="Protein",        #name of the legend for the external geom_layer
       guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6, ncol=1)
      ) +
      theme(legend.position=c(0.1, 0.5), # the position of legend.
          legend.background=element_rect(fill=NA), # the background of legend.
          legend.title=element_text(size=9), # the title size of legend.
          legend.text=element_text(size=7), # the text size of legend.
          legend.spacing.y = unit(0.02, "cm") 
      )
tp5.5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.pdf",width=13,height=26) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

6 A.5) Subset the LGT count into Wolbachia and Blochmannia origin

LGT.bacteria<-LGTs[,c(20,26)]
LGT.bacteria <- LGT.bacteria %>% count(GAGAid,suggested_prokaryote_origin, sort = T)
colnames(LGT.bacteria)<-c("GAGAid","prokaryote_origin","count")

LGT.bacteria <- na.omit(LGT.bacteria)
LGT.bacteria.WB.BM <- subset(LGT.bacteria,prokaryote_origin %in% c("Wolbachia", "Blochmannia")) %>% group_by(GAGAid) 
knitr::kable(head(LGT.bacteria.WB.BM)) 
GAGAid prokaryote_origin count
GAGA-0028 Wolbachia 34
GAGA-0087 Wolbachia 33
GAGA-0401 Wolbachia 21
GAGA-0222 Wolbachia 11
GAGA-0223 Wolbachia 11
GAGA-0020 Wolbachia 10

Add the Wolbachia and Blochmannia origin

tp6 <- tp5.5 +
  new_scale_fill() +
  geom_fruit(
          data=LGT.bacteria.WB.BM,
          geom=geom_bar,
          mapping=aes(y=GAGAid, x=count,fill=prokaryote_origin), #The count will be mapped to x
          pwidth=1,      # Width of the external layer
          stat="identity",
          orientation="y", #The orientation of the axis
          #position=position_stackx(hexpand=.7)#,   #Adjust the position of the geom_layer
          #axis.params =list(
                          #axis="x",       # add axis text of the layer.
                         # vjust=0.2,
                          #text.angle=0,   # the text size of axis.
                          #text.size=1.5,
                         # hjust=1,        # adjust the horizontal position of text of axis
                         # nbreak=2,       # number of lines within geom_layer, 
                         # ),
         # grid.params=list(alpha=.4)      # Parameter to adjust gridlines around the tree
      ) +
      scale_fill_manual(
       values=c("black","grey"),          #color of the external geom_layer
       name="Prokaryote origin",        #name of the legend for the external geom_layer
       guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6, ncol=1)
      ) +
      theme(legend.position=c(0.07, 0.5), # the position of legend.
          legend.background=element_rect(fill=NA), # the background of legend.
          legend.title=element_text(size=9), # the title size of legend.
          legend.text=element_text(size=7), # the text size of legend.
          legend.spacing.y = unit(0.02, "cm") 
      )
tp6
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.WB.BM.pdf",width=15,height=30) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).